<?php
// Scrape the article list of one page: fetch the page, cut out the region
// between two HTML markers, collect every link in that region and turn each
// href into an absolute URL. The result is dumped with print_r().
$sourceURL = "http://www.bjcang.cn/article_cat-14.html";	// URL of the article-list page to scrape
$start_str = '<dl class="artile">';		// marker that opens the article list (runtime string, kept exactly as in the original)
$end_str = '<div id="pager">';			// marker that closes the article list

$snoopy = new Snoopy();
$snoopy->fetch($sourceURL);
$fileContent = $snoopy->results;

// Locate the list region. stripos() returns false when a marker is missing;
// using that value arithmetically (as false == 0) would slice an arbitrary
// part of the page, so every lookup is checked explicitly. The end marker is
// searched for only after the start marker so an earlier occurrence cannot
// produce a negative length.
$listHtml = '';
if (is_string($fileContent))
{
	$start_pos = stripos($fileContent, $start_str);
	if ($start_pos !== false)
	{
		$start_num = $start_pos + strlen($start_str);
		$end_pos = stripos($fileContent, $end_str, $start_num);
		if ($end_pos !== false)
		{
			$end_num = $end_pos - $start_num;
			$listHtml = substr($fileContent, $start_num, $end_num);
		}
	}
}

// Keep only the anchor tags; when the region was not found this stays empty
// and the match below simply yields empty result lists.
$fileContent = strip_tags($listHtml, '<a>');

// Group 1 captures the href value, group 2 the link text.
$a = "/<a.*?href=[\"\'](.*?)[\"\'][^>]*>(.*?)<\/a>/i";
preg_match_all($a, $fileContent, $content);

// Replace every captured href with its absolute form.
foreach ($content[1] as $k=>$v){
	$content[1][$k] = FillUrl($sourceURL, $v);
}
print_r($content);
 
/**
 *  Resolve a link found on a page into an absolute URL.
 *
 *  The base directory is derived from $refurl by splitting it on '/':
 *  intermediate segments are kept unless they contain '?', and the last
 *  segment is kept only when it contains neither '?' nor '.' (i.e. it does
 *  not look like a file name or a query).
 *
 *  Handled forms of $surl:
 *   - ''                  returns the base path WITHOUT a scheme
 *   - http(s)://...       returned as is (duplicate slashes collapsed)
 *   - //host/path         protocol-relative, inherits the scheme of $refurl
 *   - /path               resolved against the host of $refurl
 *   - ./path              resolved against the base directory
 *   - '.', '..', './'     (two characters or fewer starting with '.') returns ''
 *   - anything else       appended to the base directory ('../' segments are
 *                         appended verbatim, they are not collapsed)
 *
 *  A '#fragment' is removed from $surl unless the '#' is its first character.
 *  Runs of '/' after the scheme are collapsed to a single '/'.
 *
 * @access    public
 * @param     string  $refurl  URL of the page the link was found on
 * @param     string  $surl    the link (href) to resolve
 * @return    string
 */
function FillUrl($refurl,$surl)
{
	$refurl = trim($refurl);
	$surl = trim($surl);

	// parse_url() returns false for seriously malformed URLs; normalise that
	// to an empty array so the isset() checks below stay valid.
	$urls = parse_url($refurl);
	if(!is_array($urls))
	{
		$urls = array();
	}

	// Scheme used for results that do not carry their own one.
	$scheme = (isset($urls['scheme']) && strtolower($urls['scheme'])=='https') ? 'https' : 'http';
	$defaultPort = ($scheme=='https') ? 443 : 80;

	// The path reported by parse_url() is not used directly because it is
	// awkward for URLs such as http://xxxx/nnn/aaa?fdsafd; the URL is split
	// by hand instead. $paths[0] is always the host[:port] part.
	$paths = explode('/', preg_replace("/^https?:\/\//i", "", $refurl));
	$n = count($paths);

	if(isset($urls['host']))
	{
		// Drop the port only when it is the default one for the scheme.
		$basehost = (!isset($urls['port']) || $urls['port']==$defaultPort) ? $urls['host'] : $urls['host'].':'.$urls['port'];
	}
	else
	{
		// No host recognised (e.g. scheme-less base URL): fall back to the
		// first segment instead of reading an undefined array key.
		$basehost = $paths[0];
	}

	$basepath = $basehost;
	for($i=1;$i < ($n-1);$i++)
	{
		if(strpos($paths[$i], '?')===false) $basepath .= '/'.$paths[$i];
	}
	// Only a real trailing path segment may extend the base path; with $n == 1
	// the sole segment is the host itself and must not be appended again.
	if($n > 1 && !preg_match("/[\?\.]/", $paths[$n-1]))
	{
		$basepath .= '/'.$paths[$n-1];
	}

	if($surl=='')
	{
		return $basepath;
	}

	// Strip the fragment, except when the whole link is just a fragment.
	$pos = strpos($surl, "#");
	if($pos!==false && $pos>0)
	{
		$surl = substr($surl, 0, $pos);
	}

	if(preg_match("/^https?:\/\//i", $surl))
	{
		// Already absolute.
		$okurl = $surl;
	}
	else if(substr($surl, 0, 2)=='//')
	{
		// Protocol-relative: same scheme as the referring page.
		$okurl = $scheme.':'.$surl;
	}
	else if($surl[0]=='/')
	{
		// '/' addresses the site root.
		$okurl = $basehost.$surl;
	}
	else if($surl[0]=='.')
	{
		if(strlen($surl)<=2)
		{
			return '';
		}
		else if($surl[1]=='/')
		{
			// './x' -> '<base>/x'
			$okurl = $basepath.substr($surl, 1);
		}
		else
		{
			$okurl = $basepath.'/'.$surl;
		}
	}
	else
	{
		$okurl = $basepath.'/'.$surl;
	}

	// Split off an explicit scheme so that collapsing duplicate slashes does
	// not touch the '//' that follows it.
	if(preg_match("/^(https?):\/\//i", $okurl, $m))
	{
		$scheme = strtolower($m[1]);
		$okurl = substr($okurl, strlen($m[0]));
	}
	return $scheme.'://'.preg_replace("/\/{1,}/", '/', $okurl);
}
?>